# Necessary Imports
import pandas as pd
import dtale as dt
import numpy as np
import plotly.express as pl
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.plotting import figure,show,output_notebook
output_notebook()
# Model Related
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import SGDClassifier;
from sklearn.kernel_approximation import RBFSampler;
# Evaluation metrics
from sklearn.metrics import accuracy_score,balanced_accuracy_score,cohen_kappa_score,roc_auc_score,plot_confusion_matrix,plot_roc_curve,classification_report,confusion_matrix
# if there is imbalance in classes
from imblearn.over_sampling import SMOTE
# Model Tuning Engines
# Model Tuning Engine's and Validation Engine(Final)
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,cross_val_score,StratifiedKFold,cross_val_predict
# Deploy Model to a file in binary format
import joblib as jb
plt.style.use('seaborn-whitegrid');
# Load the training data (path is relative to the working directory).
dataset_main = pd.read_csv('train.csv')
# Report the dataset dimensions.
print(f"The No of rows is :{dataset_main.shape[0]} \nThe No Features is :{dataset_main.shape[-1]}")
# Peek at the first rows.
dataset_main.head(8)
# Drop the 'id' column: it is not needed because the DataFrame already has an index.
dataset_main.drop('id', axis=1, inplace=True)
dataset_main.head(2)
# List the remaining columns, numbered from 1.
from termcolor import colored
print(colored("Feature Columns", color='blue'))
for position, column in enumerate(dataset_main.columns, start=1):
    print(position, column)
# Column dtypes and non-null counts.
dataset_main.info()
# Summary statistics of the dataset.
dataset_main.describe()
# Missing values per column.
dataset_main.isna().sum()
# Check for class imbalance in the target column.
print("Class Distributions")
print()
response_counts = dataset_main.Response.value_counts()
print(response_counts)
print()
# Derive the event rate from the data rather than hard-coding the counts,
# so the printed figure stays correct if the CSV changes.
event_rate = response_counts.min() / response_counts.sum() * 100
print(f"Event Rate of Minority class(To be predicted): {event_rate:.2f}%")
# Vehicle damage counts split by vehicle age (age categories on the y axis).
plt.figure(figsize=(5, 4))
sns.countplot(
    y=dataset_main['Vehicle_Age'],
    hue=dataset_main['Vehicle_Damage'],
    edgecolor='black',
    linewidth=1.6,
)
plt.title("Vehicle Damage Count based on vehicle age")
# Histogram grid produced by DataFrame.hist, 23 bins per panel.
ax = dataset_main.hist(bins=23, figsize=(10, 8))
plt.suptitle("Summary Stats", fontsize=30)
# Category distributions for Gender and Vehicle_Age.
# The column is passed explicitly as `x=`: handing seaborn a data vector
# positionally was deprecated in seaborn 0.11 and is no longer interpreted
# as `x` by later releases.
# Each plot gets its own figure so it does not draw onto whatever axes
# happened to be current (e.g. the previous plot).
plt.figure()
sns.countplot(x=dataset_main['Gender'])
plt.title("Gender Count ")

plt.figure()
sns.countplot(x=dataset_main['Vehicle_Age'])
plt.title("Vehicle Age Count")
# Gender and Vehicle_Age counts, each split by Response, side by side
# on a shared y axis.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 6), sharey=True)
# Data vectors are passed by keyword (`x=`); positional use was deprecated
# in seaborn 0.11 and is not treated as `x` by later releases.
sns.countplot(x=dataset_main['Gender'], hue=dataset_main['Response'], ax=ax[0], palette='rainbow')
sns.countplot(x=dataset_main['Vehicle_Age'], hue=dataset_main['Response'], ax=ax[1], palette='coolwarm')
# Blank out the per-axes y labels on both panels.
for panel in ax:
    panel.set_ylabel("")
fig.suptitle(" Category Distribution Based On Response", fontsize=25, horizontalalignment='center')
# Box plots of Age, Annual_Premium and Vintage in three side-by-side panels.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(8, 6))
# The values are passed as `y=` so the boxes are drawn vertically, which is
# what the original `orient='v'` asked for. Passing the vector positionally
# is deprecated in seaborn, and newer releases ignore `orient='v'` when only
# `x` is supplied.
sns.boxplot(y=dataset_main['Age'], ax=ax[0], color='r')
sns.boxplot(y=dataset_main['Annual_Premium'], ax=ax[1], color='blue')
sns.boxplot(y=dataset_main['Vintage'], ax=ax[2], color='g')
plt.tight_layout()
# y > 1 places the figure title above the panels after tight_layout.
plt.suptitle("Age -- Premimum -- Vintage", y=1.08, fontsize=20)
# Row subsets by target value, kept for plotting purposes.
No_response = dataset_main.loc[dataset_main['Response'] == 0]
response = dataset_main.loc[dataset_main['Response'] == 1]
# Degree-1 least-squares fit of Vintage against Age: slope m, intercept c.
m, c = np.polyfit(dataset_main['Age'], dataset_main['Vintage'], 1)
# Fitted Vintage value for every row's Age.
q = dataset_main['Age'] * m + c
# Class imbalance: Age vs Vintage scatter, coloured by Response.
from bokeh.models import ColumnDataSource, CategoricalColorMapper
from bokeh.layouts import gridplot  # not used within this section

# Response is stringified so it can serve as a categorical factor.
scatter_columns = {
    'x': list(dataset_main.Age),
    'y': list(dataset_main.Vintage),
    'label': list(map(str, dataset_main.Response)),
}
source = ColumnDataSource(scatter_columns)
# '0' is drawn in red, '1' in blue.
color_map = CategoricalColorMapper(factors=['0', '1'], palette=['red', 'blue'])
# NOTE(review): `plot_width`/`plot_height` and `legend=` are older bokeh
# spellings -- confirm they are accepted by the installed bokeh version.
p = figure(plot_width=500, plot_height=500, title='Class imbalance', tools="")
p.circle(
    x='x',
    y='y',
    source=source,
    color={'field': 'label', 'transform': color_map},
    legend='label',
)
# Second figure is created here but nothing is drawn on it or shown in this section.
d = figure(plot_width=200, plot_height=200, title='Regression Curve')
show(p)
# Regression line of the fitted values `q` against Age (first 20 rows).
# A fresh figure is opened first so the title and plot do not land on
# whatever matplotlib axes were current beforehand.
plt.figure()
plt.title("Correlation Curve(Regline)")
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.regplot(x=dataset_main['Age'][:20], y=q[:20])
plt.ylabel("")
# Correlation and residual variation between Age and premium amount,
# using the first 200 rows.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 6), sharey=False)
age_sample = dataset_main['Age'][:200]
premium_sample = dataset_main['Annual_Premium'][:200]
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.regplot(x=age_sample, y=premium_sample, ax=ax[0], line_kws={'color': 'r'})
sns.residplot(x=age_sample, y=premium_sample, ax=ax[1])
plt.suptitle("Correlation and residual variation between Age and premium amount", y=1.03, fontsize=20, horizontalalignment='center')
plt.tight_layout()
# Share of customers who were not (0) / were (1) previously insured.
insured_counts = dataset_main['Previously_Insured'].value_counts()
total_rows = len(dataset_main)
R1 = insured_counts[0] / total_rows
R2 = insured_counts[1] / total_rows
plt.figure(figsize=(6, 6))
ax = plt.pie(
    [R1, R2],
    labels=['Not Insured', 'Insured'],
    explode=[0.02, 0.03],
    autopct="%.0f%%",
    shadow=True,
)
plt.title("Ratio of people Previoulsy insured and who are not", fontsize=20)
# How past vehicle damage relates to the annual premium:
# mean Annual_Premium for Vehicle_Damage == 'No' and == 'Yes', as a bar chart.
unique_values = ['No', 'Yes']
premium_column = dataset_main['Annual_Premium']
damage_column = dataset_main['Vehicle_Damage']
mean_1 = premium_column[damage_column == 'No'].mean()
mean_2 = premium_column[damage_column == 'Yes'].mean()
alfa = figure(
    x_range=unique_values,
    plot_width=452,
    plot_height=300,
    tools="",
    toolbar_location=None,
    title="Average Premimum to be paid based on previous Vehicle damage",
)
alfa.vbar(x=unique_values, top=[mean_1, mean_2], width=0.5)
show(alfa)
# Joint distributions against Annual_Premium, restricted to the first 1000 rows.
vintage_1k = dataset_main['Vintage'].iloc[:1000]
premium_1k = dataset_main['Annual_Premium'].iloc[:1000]
age_1k = dataset_main['Age'].iloc[:1000]

# Vintage vs premium: hexbin view, then a regression view.
p = sns.jointplot(x=vintage_1k, y=premium_1k, kind='hex', color='red', edgecolor='black')
p.fig.suptitle("Vintage Vs Premimum", y=1.03, fontsize=20, horizontalalignment='left')
sns.jointplot(x=vintage_1k, y=premium_1k, kind='reg', color='blue')

# Same pair of views for Age vs premium.
p = sns.jointplot(x=age_1k, y=premium_1k, kind='hex', color='orange', edgecolor='black')
p.fig.suptitle("Age Vs premimum", y=1.03, fontsize=20, horizontalalignment='left')
sns.jointplot(x=age_1k, y=premium_1k, kind='reg', color='green')
# Correlation heatmap.
plt.figure(figsize=(8, 6))
# Correlate numeric columns only: the frame still holds string columns at
# this point, and DataFrame.corr() raises on those in recent pandas.
numeric_corr = dataset_main.select_dtypes(include='number').corr()
ax = sns.heatmap(numeric_corr, annot=True, cmap='viridis', linecolor='black', linewidth=2)
plt.title("Correlation plot", fontsize=20, y=1.01)
# Pin the y-limits to the full grid (row count at the bottom, 0 at the top).
# The previous approach nudged the current limits by +/-0.5 to undo cropped
# top/bottom rows; that only helps when the rows really are cropped and
# adds stray padding otherwise.
bottom, top = len(numeric_corr), 0
ax.set_ylim(bottom, top)
# Cross-check one heatmap entry (Age vs Annual_Premium) with scipy.
from scipy import stats
# Compute the statistic once and unpack (coefficient, p-value).
age_premium_r, age_premium_p = stats.pearsonr(dataset_main.Age, dataset_main.Annual_Premium)
print(f"The Correlation value is:{age_premium_r} \nThe P-value is {age_premium_p}")
# Multidimensional Analysis
# Taking the top 10 region codes (by number of rows) from the dataframe
# The 10 most frequent Region_Code values (the name `top22` is historical;
# only 10 codes are taken).
top22 = dataset_main.Region_Code.value_counts().index[:10]
# Positions of the rows whose Region_Code is one of those codes. A single
# vectorised membership test replaces the row-by-row Python loop and yields
# the same positions, in the same order.
in_top_regions = dataset_main['Region_Code'].isin(top22)
indexs = np.flatnonzero(in_top_regions.to_numpy()).tolist()
# Plot the response counts for those regions.
newFrame = dataset_main.iloc[indexs, :]
plt.figure(figsize=(10, 6))
# The column is passed as `x=`; positional data vectors are deprecated in seaborn.
sns.countplot(x=newFrame['Region_Code'], hue=newFrame["Response"], palette='inferno')
plt.title("Responses Based on 10 Regions", fontsize=20)
sns.set_style("white")
plt.style.use("ggplot")
# Customers who were not previously insured.
not_insured = dataset_main[dataset_main["Previously_Insured"] == 0]
# Customers who were previously insured.
insured = dataset_main[dataset_main["Previously_Insured"] == 1]
# Response counts for each group, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(6, 5))
# The column is passed as `x=`; positional data vectors are deprecated in seaborn.
sns.countplot(x=not_insured['Response'], ax=ax[0])
sns.countplot(x=insured['Response'], ax=ax[1])
ax[0].set_title("Not Insured Previously", y=1.01)
ax[1].set_title("Insured Previously", y=1.01)
plt.tight_layout()
# Note an interesting thing: 158 customers who were already insured have shown a positive response. Two things can be observed from this.
# Let us consider the second scenario for the time being.
# Just for fun: the first 200 Vintage values drawn like a time series
# (row position on the x axis; not a real time axis).
plt.style.use("seaborn-whitegrid")
plt.figure(figsize=(20, 6))
plt.plot(dataset_main['Vintage'][:200], c='green', marker='.', markersize=10, markerfacecolor='red', markeredgecolor='black')
# NOTE(review): 148 is hard-coded and labelled as the average below --
# confirm it matches the data being plotted.
plt.hlines(y=148, xmin=0, xmax=200, linestyles='dashdot', linewidth=5)
plt.vlines(x=148, ymin=0, ymax=300, linestyles='dashdot', linewidth=5)
# The label is passed positionally: the keyword was renamed from `s` to
# `text` in matplotlib 3.3, so the positional form works before and after.
plt.annotate("AVERAGE POINT", xy=(148, 148), c='red', fontsize=20)
plt.title("Vintage as One dimensional Time series(Looks like)", fontsize=20)
# Annual premium distribution for each vehicle-age category.
plt.figure(figsize=(6, 5))
# x/y are passed by keyword: seaborn no longer accepts them positionally.
sns.violinplot(x=dataset_main['Vehicle_Age'], y=dataset_main['Annual_Premium'])
plt.title("Annual premium distributed based on vehicle age", fontsize=20, y=1.02)
dataset_main.head()
# Top 10 policy sales channels, with their response counts.
# "Top" is taken by frequency via value_counts(); `unique()[:10]` only gave
# the first 10 distinct values in row order, not the most common ones.
top_10_channel = dataset_main.Policy_Sales_Channel.value_counts().index[:10].to_numpy()
# Positions of the rows belonging to those channels (vectorised membership
# test instead of a row-by-row Python loop).
in_top_channels = dataset_main['Policy_Sales_Channel'].isin(top_10_channel)
index = np.flatnonzero(in_top_channels.to_numpy()).tolist()
newframe2 = dataset_main.iloc[index, :]
plt.figure(figsize=(8, 5))
# `hue` must come from the same filtered frame as `x`; taking it from the
# full dataset pairs the x values with the wrong (and differently sized)
# Response column.
sns.countplot(x=newframe2['Policy_Sales_Channel'], hue=newframe2["Response"])
plt.title("Responses Based on Top 10 Channel", fontsize=20)
plt.ylabel("")
# The classes are imbalanced, so SMOTE oversampling is applied later on.
# Before that, the categorical variables are encoded as integers.
# Work on a copy so the original frame stays available as a backup.
dataset_main2 = dataset_main.copy()
dataset_main2.head(5)
# Pick out the string/object columns (Gender, Vehicle_Age, Vehicle_Damage
# per the original note) and replace each with its categorical codes.
text_columns = [
    name
    for name in dataset_main2.columns
    if pd.api.types.is_string_dtype(dataset_main2[name])
    or pd.api.types.is_object_dtype(dataset_main2[name])
]
for name in text_columns:
    dataset_main2[name] = pd.Categorical(dataset_main2[name]).codes
dataset_main2.head()
# Inspect the encoded dataset.
dataset_main2.info()
# Feature matrix: every column except the last; target: the last column.
feature_frame = dataset_main2.iloc[:, :-1]
target_series = dataset_main2.iloc[:, -1]
X = feature_frame.values
y = target_series.values
# Oversample with SMOTE to reduce the class imbalance.
sm = SMOTE()
X, y = sm.fit_resample(X, y)
# Before SMOTE: Age vs Vintage from the original frame, coloured by Response.
pl.scatter(dataset_main, x='Age', y='Vintage', color='Response')
# After SMOTE: same two features from the resampled matrix. The column
# positions are looked up by name rather than hard-coded (1 and 9), so the
# plot keeps showing Age vs Vintage if the column order ever changes.
# X holds every column of dataset_main2 except the last, in the same order.
age_position = dataset_main2.columns.get_loc('Age')
vintage_position = dataset_main2.columns.get_loc('Vintage')
pl.scatter(x=X[:, age_position], y=X[:, vintage_position], color=y)